#-*- coding:utf-8 -*-
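# This script scrapes Lianjia residential-community (xiaoqu) ids and stores them in a file; a follow-up script then iterates over those ids to scrape further details.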

import requests;
from lxml import etree;
import time;
import re;
import uuid;

# Mobile-browser User-Agent sent with every request (the target is the mobile site).
header = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1',
}

# Proxy table in the format accepted by requests' ``proxies=`` argument.
# NOTE(review): this table is defined but is NOT passed to requests.get below,
# so requests currently go out directly; that behaviour is kept unchanged.
# The original literal repeated the "http" key three times, so only the last
# entry ever took effect. The shadowed candidates were
# http://139.224.135.94:80 and http://110.72.43.198:8123.
proxies = {
    "http": "http://121.205.254.170:39581",
    "https": "http://49.81.248.234:8118",
}

# Listing URL prefix; the page number is appended directly after "pg".
start_url = 'https://m.lianjia.com/cd/xiaoqu/pg'
# Captures a 10-25 digit id that is followed by "/" somewhere in an href.
id_partten = re.compile(r'.*/(\d{10,25})/')

# File the ids are appended to, one per line.
OUTPUT_PATH = '14xiaoqu_id.txt'
# Inclusive range of listing pages to crawl.
FIRST_PAGE = 1
LAST_PAGE = 100
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
# Seconds to pause after each page request.
PAGE_DELAY = 1


def extract_xiaoqu_id(href):
    """Return the community id embedded in ``href``, or None if there is none.

    The id is the 10-25 digit path segment matched by ``id_partten``.
    """
    result = id_partten.match(href)
    if result is None:
        return None
    return str(result.group(1))


def parse_xiaoqu_ids(html_text):
    """Return the community ids found in one listing page, in document order.

    Each community is an ``<li class="pictext">`` whose first ``<a>`` child
    carries the detail-page link. Entries without such a link, or whose link
    holds no id, are skipped instead of aborting the whole crawl.
    """
    # Nothing to parse in a blank response body.
    if not html_text or not html_text.strip():
        return []
    selector = etree.HTML(html_text)
    if selector is None:
        return []

    ids = []
    for xiaoqu in selector.xpath('//li[@class="pictext"]'):
        hrefs = xiaoqu.xpath('a[1]/@href')
        if not hrefs:
            continue
        xiaoqu_id = extract_xiaoqu_id(hrefs[0])
        if xiaoqu_id is None:
            continue
        ids.append(xiaoqu_id)
    return ids


def main():
    """Crawl every listing page and append each community id to OUTPUT_PATH.

    Network errors (including timeouts) propagate to the caller; the ``with``
    block guarantees the output file is closed even then.
    """
    with open(OUTPUT_PATH, 'a') as myfile:
        for i in range(FIRST_PAGE, LAST_PAGE + 1):
            print('正在抓取第%d页' % i)
            url = start_url + str(i)
            # A timeout keeps one stalled connection from hanging the run.
            html = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
            # Pause between pages to avoid hammering the site.
            time.sleep(PAGE_DELAY)
            for xiaoqu_id in parse_xiaoqu_ids(html.text):
                print(xiaoqu_id)
                myfile.write(xiaoqu_id + '\n')


if __name__ == '__main__':
    main()
		
		

	